# download_eblip_issue.py
# EBLIP (Evidence Based Library and Information Practice) Downloader
# Automates downloading PDFs from EBLIP (OJS 3.3 platform)
# - Parses issue Table of Contents
# - Extracts article titles and /download/ PDF URLs
# - Skips Editorial and News Section items
# - Creates dynamic folders based on Vol/Issue/Year
# - Logs all downloads into a CSV file

import os
import re
import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# ---------- Helpers ----------
def sanitize_filename(name):
    """Return *name* reduced to a string that is safe to use as a file name.

    Runs of whitespace (including newlines and tabs that can appear inside
    scraped HTML text) are collapsed to a single space, the characters
    ``\\ / * ? : " < > |`` and ASCII control characters are removed, and
    surrounding whitespace is trimmed.

    Args:
        name: Arbitrary text, e.g. an article title.

    Returns:
        The cleaned text, or ``"untitled"`` when nothing usable remains, so
        callers never end up writing a file named just ``.pdf``.
    """
    # Collapse first so a line break inside a title becomes a word separator
    # instead of silently gluing two words together.
    collapsed = re.sub(r'\s+', " ", name)
    cleaned = re.sub(r'[\\/*?:"<>|\x00-\x1f]', "", collapsed).strip()
    return cleaned or "untitled"

def extract_volume_issue_year(title_text):
    """Build the download folder name from an issue page title.

    Expected input looks like
    ``Vol. 15 No. 1 (2020) | Evidence Based Library and Information Practice``.
    The volume and issue numbers are matched case-insensitively; the year is
    the first four-digit number found in parentheses.

    Any piece that cannot be found is replaced by the literal placeholder
    ``Vol``, ``Issue`` or ``Year``, so a title with no matches yields
    ``EBLIP_VolVol_IssueIssue_Year``.

    Args:
        title_text: Text of the page ``<title>`` element.

    Returns:
        A string of the form ``EBLIP_Vol<vol>_Issue<issue>_<year>``.
    """
    def _capture(pattern, placeholder, flags=0):
        # First capture group of the first match, or the placeholder.
        found = re.search(pattern, title_text, flags)
        if found is None:
            return placeholder
        return found.group(1)

    vol = _capture(r'Vol\.\s*(\d+)', "Vol", re.I)
    issue = _capture(r'No\.\s*(\d+)', "Issue", re.I)
    year = _capture(r'\((\d{4})\)', "Year")
    return f"EBLIP_Vol{vol}_Issue{issue}_{year}"

# ---------- Input ----------
issue_url = input("Enter EBLIP issue URL: ").strip()
if not issue_url:
    # Exit with a readable message instead of a traceback from requests.
    raise SystemExit("[ERROR] No issue URL provided.")

# ---------- Fetch Issue Page ----------
# requests applies no timeout by default; without one a stalled server
# would hang the script indefinitely.
REQUEST_TIMEOUT = 30  # seconds

print(f"[INFO] Fetching issue page: {issue_url}")
resp = requests.get(issue_url, timeout=REQUEST_TIMEOUT)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

# ---------- Folder Naming ----------
# The folder name is derived from the page <title>; a generic name is used
# when the page has no <title> element.
title_tag = soup.find("title")
if title_tag:
    folder_name = extract_volume_issue_year(title_tag.get_text())
else:
    folder_name = "EBLIP_Issue"
os.makedirs(folder_name, exist_ok=True)

# ---------- Find Articles ----------
articles = soup.find_all("div", class_="obj_article_summary")
print(f"[INFO] Found {len(articles)} article blocks")

log_path = os.path.join(folder_name, f"{folder_name}_log.csv")

count = 0
# Lower-cased file stems already used in this run, so two articles whose
# sanitized titles coincide do not overwrite each other's PDF.
used_stems = set()

# The context manager guarantees the log is flushed and closed even if an
# unexpected error aborts the loop part-way through.
with open(log_path, "w", newline="", encoding="utf-8") as log_file:
    csv_writer = csv.writer(log_file)
    csv_writer.writerow(["Title", "Article URL", "PDF URL", "Status"])

    for art in articles:
        heading = art.find("h3", class_="title")
        if not heading or not heading.a:
            continue
        title = heading.a.get_text(strip=True)
        # Resolve against the issue URL (as is done for the PDF link) and
        # tolerate an anchor that has no href attribute.
        article_url = urljoin(issue_url, heading.a.get("href", ""))

        # Skip Editorials and News
        # NOTE: this is a case-insensitive substring match on the title, so
        # it also skips titles such as "Newspaper ..." or "Newsletter ...".
        if re.search(r'Editorial|News', title, re.I):
            print(f"[SKIP] {title}")
            csv_writer.writerow([title, article_url, "", "Skipped (Editorial/News)"])
            continue

        # Find PDF link: the first galley link whose label mentions "pdf".
        pdf_link = None
        for a in art.find_all("a", class_="obj_galley_link"):
            href = a.get("href")
            if href and "pdf" in a.get_text(strip=True).lower():
                pdf_link = urljoin(issue_url, href)
                break

        if not pdf_link:
            print(f"[SKIP] No PDF found for: {title}")
            csv_writer.writerow([title, article_url, "", "No PDF"])
            continue

        # Fix PDF link to direct /download/ if it's a /view/
        pdf_link = pdf_link.replace("/view/", "/download/")

        # Pick a file name that has not been used yet in this run.
        clean_title = sanitize_filename(title)
        file_stem = clean_title
        suffix = 2
        while file_stem.lower() in used_stems:
            file_stem = f"{clean_title}_{suffix}"
            suffix += 1
        used_stems.add(file_stem.lower())
        pdf_path = os.path.join(folder_name, f"{file_stem}.pdf")

        # Download PDF
        try:
            print(f"[{count+1}] Downloading: {clean_title}")
            r = requests.get(pdf_link, timeout=REQUEST_TIMEOUT)
            r.raise_for_status()
            with open(pdf_path, "wb") as f:
                f.write(r.content)
        except Exception as e:
            # Deliberately broad: one failed article is recorded in the log
            # and must not abort the rest of the batch.
            print(f"[ERROR] Failed: {title} - {e}")
            csv_writer.writerow([title, article_url, pdf_link, f"Error: {e}"])
        else:
            csv_writer.writerow([title, article_url, pdf_link, "OK"])
            count += 1

print(f"\nDone! {count} PDFs saved in {folder_name}")
print(f"Log file created: {log_path}")
